# General-purpose data wrangling
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.0     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Graphing
library(ggplot2)
# Date manipulation
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.5
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Scales manipulation
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
break_log_cust <- function(range) {
  breaks_custom <- c()
  for (x in range) {
    for (i in 1:9) {
      breaks_custom <- c(breaks_custom, i*10^x)
    }
  }
  return(breaks_custom)
}
datasets <- read_rds("../data/datasets/epsrc_grant_datasets.rds")

datasets
## $person_roles
## # A tibble: 73,218 x 4
##    person_role_id                       grant_ref    role       person_id       
##    <chr>                                <fct>        <fct>      <fct>           
##  1 578F78BD-E807-4C27-B638-DF6A80C58264 EP/G026440/1 PRINCIPAL~ 390D96F7-5924-4~
##  2 F4682871-ED94-443E-B640-46F573504687 EP/I006591/1 PRINCIPAL~ 0A4952F9-DDA2-4~
##  3 3BC8723D-5584-4657-ABC4-E61F0DBEDAFA EP/F032145/1 PRINCIPAL~ AF612336-CBA1-4~
##  4 540DABDB-9231-4101-9E7C-28128D884EA5 EP/F032145/1 CO_INVEST~ 690F821C-1EDD-4~
##  5 4DA004EF-8282-43D6-B5DE-7AEE47E3FF23 EP/K012258/1 PRINCIPAL~ 1DB55081-3907-4~
##  6 AD7D0F55-3621-46BF-8B5A-C6FF630991C1 EP/K012258/1 CO_INVEST~ 6024802D-8803-4~
##  7 C3D97C2E-0EE6-497B-B5AD-6430FD74E662 EP/K012258/1 CO_INVEST~ 8E16681F-5AB4-4~
##  8 406F342D-8747-4542-97F3-25551E437A0F EP/K012258/1 CO_INVEST~ 65FE98F7-4153-4~
##  9 0F2B8601-42C9-4335-B91C-D063F59FE7F3 EP/S513817/1 TRAINING_~ 6D1727F8-71D0-4~
## 10 9FDF33D0-8310-4945-9274-09AECC73EAE1 EP/H023909/1 CO_INVEST~ A3D04976-A9D9-4~
## # ... with 73,208 more rows
## 
## $people
## # A tibble: 31,261 x 4
##    person_id                            first_name other_names surname      
##    <chr>                                <chr>      <chr>       <chr>        
##  1 390D96F7-5924-476A-B7E3-E87694AA1FE0 tanniemola bunting     liverpool    
##  2 0A4952F9-DDA2-4767-833F-3E4B5BE2F878 mike       <NA>        gregory      
##  3 AF612336-CBA1-4927-829C-883C19D94EA0 hua        <NA>        dong         
##  4 690F821C-1EDD-45DF-AD28-85CA4A037985 robert     <NA>        macredie     
##  5 1DB55081-3907-4397-98C1-3E6CBA25FD3E robin      bruce       bedford      
##  6 6024802D-8803-4519-89B4-40D5B4361A2F david      j           fermin       
##  7 8E16681F-5AB4-4AEC-B1E8-40ED46301A01 guy        <NA>        lloyd-jones  
##  8 65FE98F7-4153-470B-A575-E03BBFA2CCDC jeremy     noel        harvey       
##  9 6D1727F8-71D0-42BB-BDE1-4ED8BA097323 elizabeth  <NA>        towns-andrews
## 10 A3D04976-A9D9-4851-8D8D-26115BFE0BC6 julie      <NA>        macpherson   
## # ... with 31,251 more rows
## 
## $name_gender
## # A tibble: 6,823 x 3
##    name      proportion_male count
##    <chr>               <dbl> <dbl>
##  1 a                  0.68   19908
##  2 aad                0.95     863
##  3 aaditya            1        100
##  4 aakanksha          0.0400    49
##  5 aakeen            NA         NA
##  6 aakifah            0          6
##  7 aaliyah            0.0100   673
##  8 aamal              0.0700    15
##  9 aamina             0.0600    32
## 10 aamna              0.0400    46
## # ... with 6,813 more rows
## 
## $organisation_roles
## # A tibble: 93,571 x 4
##    org_role_id                          grant_ref    org_role        org_id     
##    <chr>                                <fct>        <fct>           <fct>      
##  1 CD22E167-9295-49C1-A774-788E7EACA160 EP/G026440/1 LEAD_RO         4A348A76-B~
##  2 97907792-9F95-421F-A9D1-8D813DB6A719 EP/G026440/1 COLLABORATOR    3EAE04CA-9~
##  3 7F9E4C86-998D-474A-88C3-84CC45779C04 EP/G026440/1 COLLABORATOR    46B41008-0~
##  4 5203030E-E220-4F5E-A0CB-C120F750D7FB EP/G026440/1 COLLABORATOR    7DD38B9D-A~
##  5 827544D6-1434-490B-B7DF-A4F6C1AB5A44 EP/G026440/1 PROJECT_PARTNER CAC1288E-6~
##  6 2FA832A7-FE28-4EBA-9865-DA0F19D7DDE4 EP/I006591/1 LEAD_RO         D1774113-D~
##  7 5CB8DA76-7BEC-4253-9116-284468B8CBF2 EP/F032145/1 LEAD_RO         BECA2763-0~
##  8 1B9CC61F-1154-426F-B88A-7A95494AEBD1 EP/F032145/1 PROJECT_PARTNER 87ED9DCB-0~
##  9 8CC6B9E1-B193-4D9E-85AE-C6637D1F40B4 EP/F032145/1 PROJECT_PARTNER 532931BB-1~
## 10 F160A4AF-E424-439C-AB9C-C5FFFD9FF12A EP/F032145/1 PROJECT_PARTNER 08D2B6FC-E~
## # ... with 93,561 more rows
## 
## $organisations
## # A tibble: 16,395 x 12
##    org_id   org_name  org_url      address_line1 address_post_co~ address_region
##    <chr>    <chr>     <chr>        <chr>         <chr>            <chr>         
##  1 4A348A7~ Universi~ https://gtr~ 5 Tyndall Av~ BS8 1UD          South West    
##  2 3EAE04C~ Universi~ https://gtr~ University C~ OX1 2JD          South East    
##  3 46B4100~ Durham U~ https://gtr~ Palatine Cen~ DH1 3LE          Unknown       
##  4 7DD38B9~ Tohoku U~ https://gtr~ <NA>          <NA>             <NA>          
##  5 CAC1288~ Syracuse~ https://gtr~ Syracuse Uni~ NY 13244-4       Outside UK    
##  6 D177411~ Universi~ https://gtr~ Lensfield Ro~ CB2 1EW          Unknown       
##  7 BECA276~ Brunel U~ https://gtr~ Kingston Lane UB8 3PH          Unknown       
##  8 87ED9DC~ Wright D~ https://gtr~ 1a Norwich S~ CB2 1ND          East of Engla~
##  9 532931B~ Ricabili~ https://gtr~ Unit 10, Ble~ N7 9NY           Unknown       
## 10 08D2B6F~ Granta D~ https://gtr~ Rustat House  CB1 7EG          East of Engla~
## # ... with 16,385 more rows, and 6 more variables: address_country <chr>,
## #   address_line2 <chr>, address_line4 <chr>, address_line5 <chr>,
## #   address_line3 <chr>, address_city <chr>
## 
## $subject_weight
## # A tibble: 25,082 x 4
##    subject_weight_id                    grant_ref    subject_id       percentage
##    <chr>                                <fct>        <fct>                 <dbl>
##  1 D666A364-FF4E-4CEA-83D6-C85CDF5496DD EP/G026440/1 17F21DBC-4946-4~         50
##  2 4DB01636-14E2-4E0D-8EAE-4F8971C3DFD5 EP/G026440/1 E847DD9B-2053-4~         50
##  3 0E58470D-3A0D-42EE-AAD3-86BD4EC2E8DE EP/I006591/1 5E9AA4EC-49E3-4~        100
##  4 ACE2E454-919D-43AA-9F88-5991B5BA1CEA EP/F032145/1 5E9AA4EC-49E3-4~        100
##  5 5CEE216E-4132-42E3-B504-D856E5B0F21E EP/K012258/1 64596636-AE88-4~        100
##  6 AFA8A43B-F969-4440-9EA4-C6BA3E49E607 EP/H023909/1 64596636-AE88-4~         40
##  7 F13F0FF9-A847-4FCC-840B-AD92E5C91A86 EP/H023909/1 E847DD9B-2053-4~         60
##  8 598878F7-41CC-4B16-B59A-9ECD57468347 EP/I004246/1 EB5F16BB-2772-4~        100
##  9 43299E65-463F-492F-82C4-705183935851 EP/H049746/1 772CD758-53CD-4~        100
## 10 68E2C54B-7097-4E2E-94D1-CAFD865A1A7F EP/K007777/1 F95121C7-7C31-4~        100
## # ... with 25,072 more rows
## 
## $subjects
## # A tibble: 74 x 2
##    subject_id                           subject                       
##    <chr>                                <fct>                         
##  1 17F21DBC-4946-4262-AD07-9914F6EA12F7 Bioengineering                
##  2 E847DD9B-2053-4BC7-97A9-4213B185C968 Materials sciences            
##  3 5E9AA4EC-49E3-4D6A-B545-79DA07CE39E0 Management & Business Studies 
##  4 64596636-AE88-4F6E-A816-9C4C203E1197 Catalysis & surfaces          
##  5 EB5F16BB-2772-4DDE-BD6C-3B7A6914B64C Info. & commun. Technol.      
##  6 772CD758-53CD-407F-9B2C-F2B861E86155 Mechanical Engineering        
##  7 F95121C7-7C31-40CE-BDBC-1E2BF025AEA6 Energy                        
##  8 1A1A6805-9DC4-4BCE-BC70-9F2AA4FD093B Ecol, biodivers. & systematics
##  9 DEA11FBC-BEED-4EDD-890B-97D728462D26 Mathematical sciences         
## 10 FB535BD0-E265-4C0A-8532-32DCB83A3951 Tools, technologies & methods 
## # ... with 64 more rows
## 
## $topic_weight
## # A tibble: 45,439 x 4
##    topic_weight_id                      grant_ref    topic_id         percentage
##    <chr>                                <fct>        <fct>                 <dbl>
##  1 FBB8A358-AA0E-41CE-9D63-67D1CE5DB753 EP/G026440/1 BD108981-2A72-4~         50
##  2 AE70E5D8-B282-425B-92D3-84D0C712FC84 EP/G026440/1 561091A1-9FC9-4~         25
##  3 D5407096-419B-4ABC-917E-5FDEA1040371 EP/G026440/1 063026C5-CD0B-4~         25
##  4 9D8FA88E-9FEA-42EC-8D1A-C1F4300E8FCB EP/I006591/1 6A482B09-D749-4~        100
##  5 E1965173-A691-4FEB-9382-3BAB2074839B EP/F032145/1 6A482B09-D749-4~        100
##  6 EED7986B-06E6-4D50-A1AD-4CBBAAE122F8 EP/K012258/1 35E9CECD-F211-4~        100
##  7 2CFEAB18-BEA8-4B28-9DAD-D9BDE6E4B2A0 EP/S513817/1 D05BC2E0-0345-4~         NA
##  8 E9F1D628-29CC-4FB8-921A-9E1F4934E2AC EP/H023909/1 561091A1-9FC9-4~         30
##  9 1AE1739D-5F9B-48F6-91C2-0831DCBBBCF9 EP/H023909/1 063026C5-CD0B-4~         30
## 10 2EDB060D-4FF5-4327-BD53-C22526FE1585 EP/H023909/1 0C89BB7D-DCEF-4~         40
## # ... with 45,429 more rows
## 
## $topics
## # A tibble: 343 x 2
##    topic_id                             topic                        
##    <chr>                                <fct>                        
##  1 BD108981-2A72-4C45-AAC2-425461EB90BE Biomaterials                 
##  2 561091A1-9FC9-4508-B1B2-2F3623E1FC9D Materials Characterisation   
##  3 063026C5-CD0B-48AB-82F3-6359499A95D4 Materials Synthesis & Growth 
##  4 6A482B09-D749-4443-8AF0-654984FCE91D Manufact. Business Strategy  
##  5 35E9CECD-F211-4914-9F91-4B8FB76B773D Catalysis & Applied Catalysis
##  6 D05BC2E0-0345-4A3F-8C3F-775BC42A0819 Unclassified                 
##  7 0C89BB7D-DCEF-4DCF-84DB-CA53C907AB0E Surfaces & Interfaces        
##  8 76783275-A9F8-4B4E-B314-51363124259C Fundamentals of Computing    
##  9 0A982A4A-12CF-4734-AFCA-A5DC61F667F3 Information & Knowledge Mgmt 
## 10 ED6338AE-3457-45D6-90CA-B994C3CF422B Aerodynamics                 
## # ... with 333 more rows
## 
## $grant_themes
## # A tibble: 8,918 x 3
##    grant_theme_id                       grant_ref    theme_id                   
##    <chr>                                <fct>        <fct>                      
##  1 02bf5930-df87-41e5-86fa-7d66659591f2 EP/V028855/1 ea880e85-f413-4373-98af-42~
##  2 a90cadd6-3977-40be-a687-857635e90c48 EP/V05385X/1 ea880e85-f413-4373-98af-42~
##  3 a6b639ed-70d7-4e7a-a8ce-f980a9164060 EP/V00817X/1 ea880e85-f413-4373-98af-42~
##  4 e75a7ed0-726c-462e-a15f-3e8f2ea719e5 EP/V029975/1 ea880e85-f413-4373-98af-42~
##  5 4270f864-77a4-48c4-a2a7-518574e3b50f EP/T012595/1 ea880e85-f413-4373-98af-42~
##  6 b3add47a-4a72-484e-87e8-6758e71063f7 EP/V007785/1 ea880e85-f413-4373-98af-42~
##  7 7d2954c4-d219-4009-a33f-18f73ef08025 EP/V007750/1 ea880e85-f413-4373-98af-42~
##  8 e5f314ea-129d-44dd-b36e-f135a6123412 EP/V007742/1 ea880e85-f413-4373-98af-42~
##  9 20131c65-ad81-463f-8ccc-78c9a2c76dec EP/V012126/1 ea880e85-f413-4373-98af-42~
## 10 9be9bf53-3cac-40ba-af32-44d4504a6b18 EP/V012037/1 ea880e85-f413-4373-98af-42~
## # ... with 8,908 more rows
## 
## $themes
## # A tibble: 638 x 4
##    theme_id                             theme_type theme_name  theme_research_a~
##    <chr>                                <fct>      <fct>       <fct>            
##  1 ea880e85-f413-4373-98af-428a6a52e465 Challenge  Engineering Analytical Scien~
##  2 645516da-c627-4ea8-b057-d35168d65986 Challenge  Engineering Architecture and~
##  3 6c704d32-3a2a-462e-a1ae-916fc7506c5e Challenge  Engineering Artificial Intel~
##  4 41fbadd0-19b8-48de-a012-fe2a8401354d Challenge  Engineering Assistive Techno~
##  5 85001c51-b6ce-4e0a-b6a4-d637852be41e Challenge  Engineering Bioenergy        
##  6 096cb244-b815-4adc-bc4e-c9a216d6f307 Challenge  Engineering Biological Infor~
##  7 d1fe47c3-a1bf-4b34-b804-4d574d859e9b Challenge  Engineering Biomaterials and~
##  8 34891f64-93be-4900-9092-5ab9f4649b36 Challenge  Engineering Biophysics and S~
##  9 c7cd59fc-7e7c-45d9-a706-aeca4cb6360b Challenge  Engineering Built Environment
## 10 0fbf5993-46c7-471d-a33c-a51d08e40012 Challenge  Engineering Carbon Capture a~
## # ... with 628 more rows
## 
## $grants
## # A tibble: 29,805 x 11
##    grant_ref    grant_id grant_title grant_abstract lead_ro_departm~ grant_category
##    <chr>        <chr>    <chr>       <chr>          <chr>            <fct>         
##  1 EP/G026440/1 66B9C31~ NSF Materi~ "This proposa~ Mathematics      Research Grant
##  2 EP/I006591/1 67133CB~ UK-Japan I~ "The aim of t~ Engineering      Research Grant
##  3 EP/F032145/1 68679F7~ Facilitati~ "Inclusive de~ Sch of Engineer~ Research Grant
##  4 EP/K012258/1 8D1B2AE~ The Ironwo~ "The formatio~ Chemistry        Research Grant
##  5 EP/S513817/1 BFED96A~ Industrial~ "Doctoral Tra~ Research and En~ Training Grant
##  6 EP/H023909/1 C18E548~ Uncovering~ "Electrochemi~ Chemistry        Research Grant
##  7 EP/I004246/1 8AF8AD2~ Foundation~ "Many importa~ Dept of Computi~ Fellowship    
##  8 EP/H049746/1 8B040A0~ Equipment ~ "Aerospace is~ Div of Material~ Research Grant
##  9 EP/P504546/1 407EE17~ DHPA - Kee~ "Doctoral Tra~ Research Develo~ Training Grant
## 10 EP/N509346/1 175AFA1~ Industrial~ "Doctoral Tra~ Research Finance Training Grant
## # ... with 29,795 more rows, and 5 more variables: start_date <date>,
## #   end_date <date>, award_value <dbl>, region <fct>, project_open <lgl>

How has the amount of funding awarded changed over time?

amount_start_date <- datasets$grants %>% 
  select(
    start_date,
    award_value
  )
amount_start_year <- amount_start_date %>% 
  mutate(
    start_year = as_factor(year(start_date))
  ) %>% 
  group_by(
    start_year
  ) %>% 
  summarise(
    mean_award_value = mean(award_value),
    total_award_value = sum(award_value)
  )

amount_start_year
amount_start_month <- amount_start_date %>% 
  mutate(
    start_month = as_factor(month(start_date, label = TRUE, abbr = TRUE))
  ) %>% 
  group_by(
    start_month
  ) %>% 
  summarise(
    mean_award_value = mean(award_value),
    total_award_value = sum(award_value)
  )

amount_start_month
amount_start_year_month <- amount_start_date %>% 
  mutate(
    start_year = as_factor(year(start_date)),
    start_month = month(start_date, label = TRUE, abbr = TRUE),
    # start_quater = as_factor(quarter(start_date))
  ) %>% 
  group_by(
    start_year,
    start_month,
    # start_quater
  ) %>% 
  summarise(
    mean_award_value = mean(award_value),
    total_award_value = sum(award_value)
  )
## `summarise()` has grouped output by 'start_year'. You can override using the `.groups` argument.
amount_start_year_month

Average Award Amount

amount_start_year %>% 
  ggplot(
    mapping = aes(
      x = start_year,
      y = mean_award_value,
      fill = start_year
    )
  ) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(
    name = "Mean Award Value",
    labels = label_dollar(prefix = "£", suffix = ""),
    breaks = extended_breaks(8),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_x_discrete(
    name = "Start Year"
  ) +
  scale_fill_discrete() 

amount_start_month %>% 
  ggplot(
    mapping = aes(
      x = start_month,
      y = mean_award_value,
      group = start_month,
      # fill = hue_pal()(12)
      fill = start_month
    )
  ) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(
    name = "Mean Award Value",
    labels = label_dollar(prefix = "£", suffix = ""),
    breaks = extended_breaks(8),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_x_discrete(
    name = "Start Month"
  ) +
  scale_fill_discrete()

amount_start_year_month %>% 
  ggplot() +
  geom_line(
    mapping = aes(
      x = start_year,
      y = mean_award_value,
      group = start_month,
      colour = start_month
    ),
    size = 1
  ) +
  scale_y_continuous(
    name = "Mean Award Value",
    labels = label_dollar(prefix = "£", suffix = ""),
    # breaks = extended_breaks(8),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_x_discrete(
    name = "Start Year"
  ) +
  scale_color_discrete(
    name = "Start Month"
  ) + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5)
  )

Total Award Amount

amount_start_year %>% 
  ggplot(
    mapping = aes(
      x = start_year,
      y = total_award_value,
      fill = start_year
    )
  ) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(
    name = "Total Award Value",
    labels = label_dollar(prefix = "£", suffix = ""),
    breaks = extended_breaks(8),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_x_discrete(
    name = "Start Year"
  )

amount_start_month %>% 
  ggplot(
    mapping = aes(
      x = start_month,
      y = total_award_value,
      group = start_month,
      fill = start_month
    )
  ) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(
    name = "Total Award Value",
    labels = label_dollar(prefix = "£", suffix = ""),
    breaks = extended_breaks(8),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_x_discrete(
    name = "Start Month"
  ) +
  scale_fill_discrete()

amount_start_year_month %>% 
  ggplot(
    mapping = aes(
      x = start_year,
      y = total_award_value,
      group = start_month,
      col = start_month
    )
  ) +
  geom_line(size = 1)  +
  scale_y_continuous(
    name = "Total Award Value",
    labels = label_dollar(prefix = "£", suffix = ""),
    breaks = extended_breaks(8),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_x_discrete(
    name = "Start Year",
  ) +
  scale_color_discrete(
    name = "Start Month"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5)
  )

## Distribtuion of Award Value

amount_start_date %>% 
  mutate(
    start_year = as_factor(year(start_date))
  ) %>% 
  group_by(start_year) %>% 
  # filter(award_value != 0) %>% 
  ggplot() +
  geom_boxplot(
    mapping = aes(
      x = start_year,
      y = award_value,
      group = start_year,
      colour = start_year
    ),
    # outlier.shape = NA,
  ) +
  scale_y_continuous(
    name = "Award Value",
    labels = label_dollar(prefix = "£"),
    trans = pseudo_log_trans(base = 10),
    breaks = c(0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8),
    expand = expansion(mult = c(0, .05))
  ) +
  theme(
    legend.position = "none"
  ) +
  labs(
    title = "Yearly Distribution of Award Value - Zero Award Values excluded"
  )

amount_start_date %>% 
  mutate(
    start_year = as_factor(year(start_date))
  ) %>% 
  group_by(start_year) %>% 
  filter(award_value != 0) %>%
  ggplot() +
  geom_boxplot(
    mapping = aes(
      x = start_year,
      y = award_value,
      group = start_year,
      colour = start_year
    ),
    # outlier.shape = NA,
  ) +
  scale_y_continuous(
    name = "Award Value",
    labels = label_dollar(prefix = "£"),
    trans = pseudo_log_trans(base = 10),
    breaks = c(0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8),
    expand = expansion(mult = c(0, .05))
  ) +
  theme(
    legend.position = "none"
  ) +
  scale_x_discrete(
    name = "Start Year"
  ) +
  labs(
    title = "Yearly Distribution of Award Value - Zero Award Values excluded"
  )

Distribution of Award Amount

datasets$grants %>% 
  ggplot() +
  geom_histogram(
    aes(x = award_value),
    bins = 75,
    fill = hue_pal()(1)
  ) +
  scale_x_continuous(
    name = "Award Value",
    labels = label_dollar(prefix = "£"),
    trans = pseudo_log_trans(base = 10),
    breaks = c(0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8),
    minor_breaks = break_log_cust(0:8),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_y_continuous(
    name = "Count",
    breaks_extended(10),
    expand = expansion(mult = c(0, .05))
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

datasets$grants %>% 
  filter(award_value != 0) %>% 
  ggplot() +
  geom_histogram(
    mapping = aes(
      x = award_value
    ),
    bins = 75,
    fill = hue_pal()(1)
  ) +
  scale_x_log10(
    name = "Award Value",
    labels = label_dollar(prefix = "£"),
    breaks = breaks_log(6),
    minor_breaks = break_log_cust(3:8),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_y_continuous(
    name = "Count",
    breaks_extended(10),
    expand = expansion(mult = c(0, .05))
  ) +
  theme(
    legend.position = "none"
  )

# Lead Research Organisations and Grant Funding

What is the distribution of funding provided to Lead Research Organisations?

org_roles_joined <- datasets$organisation_roles %>% left_join(datasets$organisations, by = "org_id")
  
grants_x_orgs <- datasets$grants %>% left_join(org_roles_joined, by = "grant_ref")

org_funding <- grants_x_orgs %>% 
  select(
    grant_ref,
    award_value,
    org_role,
    org_name
  )

org_funding
summary_org <- org_funding %>% 
  group_by(org_name, org_role) %>% 
  summarise(
    total_award_amount = sum(award_value),
    mean_award_amount = mean(award_value),
    number_of_grants = n()
  )
## `summarise()` has grouped output by 'org_name'. You can override using the `.groups` argument.
summary_org
summary_org %>% 
  filter(org_role == "LEAD_RO") %>% 
  ggplot() +
  geom_density(
    aes(
      x = total_award_amount,
      ..count..
    ),
    # stat = "bin",
    # bins = 40,
    col = hue_pal()(3)[1],
    # fill = hue_pal()(1)
    size = 0.75
  ) +
  scale_x_log10(
    name = "Total Amount Awarded",
    labels = label_dollar(prefix = "£"),
    breaks = breaks_log(6),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_y_continuous(
    name = "Count",
    breaks_extended(10),
    expand = expansion(mult = c(0, .05))
  ) +
  labs(
    title = "Distribution of Total Amount Awarded to Lead Reasearch Organisations"
  )

summary_org %>% 
  filter(org_role == "LEAD_RO") %>% 
  ggplot() +
  geom_density(
    aes(
      x = mean_award_amount,
      ..count..
    ),
    # stat = "bin",
    # bins = 40,
    col = hue_pal()(3)[2],
    # fill = hue_pal()(1)
    size = 0.75
  ) +
  scale_x_log10(
    name = "Mean Amount Awarded",
    labels = label_dollar(prefix = "£"),
    minor_breaks = break_log_cust(3:7),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_y_continuous(
    name = "Count",
    breaks_extended(10),
    expand = expansion(mult = c(0, .05))
  ) +
  labs(
    title = "Distribution of Mean Amount Awarded to Lead Reasearch Organisations"
  )

summary_org %>% 
  filter(org_role == "LEAD_RO") %>% 
  ggplot() +
  geom_density(
    aes(
      x = number_of_grants,
      ..count..
    ),
    # stat = "bin",
    # bins = 40,
    col = hue_pal()(3)[3],
    # fill = hue_pal()(1)
    size = 0.75
  ) +
  scale_x_log10(
    name = "Total Number of Grants Awarded",
    minor_breaks = break_log_cust(0:3),
    expand = expansion(mult = c(0, .05))
  ) +
  # scale_x_continuous(
  #   name = "Total Number of Grants Awarded",
  #   breaks = breaks_extended(10),
  # ) +
  scale_y_continuous(
    name = "Count",
    breaks_extended(10),
    expand = expansion(mult = c(0, .05))
  ) +
  labs(
    title = "Distribution of Total Number of Grants Awarded to Lead Reasearch Organisations"
  )

# Which Lead Research Organisations receive the most funding?

summary_org %>% 
  filter(org_role == "LEAD_RO") %>% 
  arrange(desc(total_award_amount)) %>% 
  head(15) %>% 
  ggplot() +
  geom_col(
    aes(
      x = total_award_amount,
      y = reorder(org_name, total_award_amount),
    ),
    fill = hue_pal()(3)[1]
  ) +
  scale_x_continuous(
    name = "Total Award Amount",
    labels = label_dollar(prefix = "£"),
    breaks = breaks_extended(),
    minor_breaks = breaks_extended(10),
    expand = expansion(mult = c(0, .01))
  ) +
  scale_y_discrete(
    name = ""
  ) +
  theme(
    # plot.margin = unit(c(0.5,1.5,0.5,0), "cm")
  ) +
  labs(
    title = "Top 15 Lead Reasearch Organisations by Total Amount Awarded"
  ) 

summary_org %>% 
  filter(org_role == "LEAD_RO") %>% 
  arrange(desc(mean_award_amount)) %>% 
  head(15) %>% 
  ggplot() +
  geom_col(
    aes(
      x = mean_award_amount,
      y = reorder(org_name, mean_award_amount),
    ),
    fill = hue_pal()(3)[2]
  ) +
  # scale_x_continuous(
  #   name = "Mean Award Amount",
  #   labels = label_dollar(prefix = "£"),
  #   expand = expansion(mult = c(0, .01)),
  # ) +
  scale_x_log10(
    name = "Mean Award Amount",
    labels = label_dollar(prefix = "£"),
    expand = expansion(mult = c(0, .01)),
    minor_breaks = break_log_cust(0:7)
  ) +
  scale_y_discrete(
    name = ""
  ) +
  theme(
    # plot.margin = unit(c(0.5,1.5,0.5,0), "cm")
  ) +
  labs(
    title = "Top 15 Lead Reasearch Organisations by Mean \nAmount Awarded"
  ) 

summary_org %>% 
  filter(org_role == "LEAD_RO") %>% 
  arrange(desc(number_of_grants)) %>% 
  head(15) %>% 
  ggplot() +
  geom_col(
    aes(
      x = number_of_grants,
      y = reorder(org_name, number_of_grants),
    ),
    fill = hue_pal()(3)[1]
  ) +
  scale_x_continuous(
    name = "Count of Grants Awarded",
    expand = expansion(mult = c(0, .05)),
    minor_breaks = breaks_extended(25)
  ) +
  scale_y_discrete(
    name = ""
  ) +
  labs(
    title = "Top 15 Lead Reasearch Organisations by Total Number of \nGrants Awarded"
  )

# Grant Subjects and Topics

grants_x_subjects <- datasets$grants %>% 
  left_join(datasets$subject_weight, by = "grant_ref") %>% 
  left_join(datasets$subjects, by = "subject_id") %>% 
  select(
    grant_ref,
    subject,
    percentage,
    award_value
  ) %>% 
  replace_na(
    replace = list(
      percentage = 100
    )
  ) %>% 
  mutate(
    percentage = percentage/100,
    weighted_award_value = percentage * award_value
  )


grants_x_subjects_summary <- grants_x_subjects %>% 
  group_by(subject) %>% 
  summarise(
    total_weighted_award_value = sum(weighted_award_value),
    mean_percentage = mean(percentage),
    mean_weighted_award_value = mean(weighted_award_value),
    total_number_of_grants = n()
  )

grants_x_subjects_summary
grants_x_topics <- datasets$grants %>% 
  left_join(datasets$topic_weight, by = "grant_ref") %>% 
  left_join(datasets$topics, by = "topic_id") %>% 
  select(
    grant_ref,
    topic,
    percentage,
    award_value
  ) %>% 
  replace_na(
    replace = list(
      topic = "Unclassified",
      percentage = 100
    )
  ) %>% 
  mutate(
    percentage = percentage/100,
    weighted_award_value = percentage * award_value
  )


grants_x_topics_summary <- grants_x_topics %>% 
  group_by(topic) %>% 
  summarise(
    total_weighted_award_value = sum(weighted_award_value),
    mean_percentage = mean(percentage),
    mean_weighted_award_value = mean(weighted_award_value),
    total_number_of_grants = n()
  )

grants_x_topics_summary

What is the distribution of funding provided to each Subject and Topic?

Subjects

grants_x_subjects_summary %>% 
  ggplot() +
  geom_density(
    aes(
      x = total_weighted_award_value,
      col = "total_weighted_award_value",
      ..count..
    ),
    # col = hue_pal()(3)[1],
    size = 0.75
  ) +
  geom_density(
    aes(
      x = mean_weighted_award_value,
      col = "mean_weighted_award_value",
      ..count..
    ),
    # col = hue_pal()(3)[2],
    size = 0.75
  ) +
  scale_x_continuous(
    name = "Weighted Amount Awarded",
    labels = label_dollar(prefix = "£"),
    trans = pseudo_log_trans(base = 10),
    breaks = c(0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9),
    minor_breaks = break_log_cust(0:9),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_y_continuous(
    name = "Count",
    breaks_extended(10),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_color_discrete(
    name = NULL,
    labels = c("Mean", "Total")
  ) +
  labs(
    title = "Distribution of Weighted Amount Awarded to Each Subject"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.justification=c(1,1), 
    legend.position=c(1,1)
  )

grants_x_subjects_summary %>% 
  ggplot() +
  geom_density(
    aes(
      x = total_number_of_grants,
      ..count..
    ),
    col = hue_pal()(3)[3],
    size = 0.75
  ) +
  scale_x_continuous(
    name = "Number of Grants",
    labels = number,
    trans = pseudo_log_trans(base = 10),
    breaks = c(0, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9),
    minor_breaks = break_log_cust(0:9),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_y_continuous(
    name = "Count",
    breaks_extended(10),
    expand = expansion(mult = c(0, .05))
  ) +
  labs(
    title = "Distribution of Total Number of Grants Under Each Subject",
    subtitle = "Including grants with a subject of NA"
  )

Topics

grants_x_topics_summary %>% 
  ggplot() +
  geom_density(
    aes(
      x = total_weighted_award_value,
      col = "total_weighted_award_value",
      ..count..
    ),
    # col = hue_pal()(3)[1],
    size = 0.75
  ) +
  geom_density(
    aes(
      x = mean_weighted_award_value,
      col = "mean_weighted_award_value",
      ..count..
    ),
    # col = hue_pal()(3)[2],
    size = 0.75
  ) +
  scale_x_continuous(
    name = "Weighted Amount Awarded",
    labels = label_dollar(prefix = "£"),
    trans = pseudo_log_trans(base = 10),
    breaks = c(0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9),
    minor_breaks = break_log_cust(0:9),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_y_continuous(
    name = "Count",
    breaks_extended(10),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_color_discrete(
    name = NULL,
    labels = c("Mean", "Total")
  ) +
  labs(
    title = "Distribution of Weighted Amount Awarded to Each Topic"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.justification=c(1,1), 
    legend.position=c(1,1)
  )

grants_x_topics_summary %>% 
  ggplot() +
  geom_density(
    aes(
      x = total_number_of_grants,
      ..count..
    ),
    col = hue_pal()(3)[3],
    size = 0.75
  ) +
  scale_x_continuous(
    name = "Number of Grants",
    labels = number,
    trans = pseudo_log_trans(base = 10),
    breaks = c(0, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9),
    minor_breaks = break_log_cust(0:9),
    expand = expansion(mult = c(0, .05))
  ) +
  scale_y_continuous(
    name = "Count",
    breaks_extended(10),
    expand = expansion(mult = c(0, .05))
  ) +
  labs(
    title = "Distribution of Total Number of Grants Under Each topic",
    subtitle = "Including grants with a topic of NA"
  )

Which Subjects and Topics have recieved the Most Grant Funding?

Subjects

grants_x_subjects_summary %>% 
  filter(!is.na(subject)) %>%
  arrange(desc(total_weighted_award_value)) %>% 
  head(10)  %>% 
  ggplot() +
  geom_col(
    aes(
      x = total_weighted_award_value,
      y = reorder(subject, total_weighted_award_value),
    ),
    fill = hue_pal()(3)[1]
  ) +
  # scale_x_log10(
  #   name = "Total Weighted Award Amount",
  #   labels = label_dollar(prefix = "£"),
  #   expand = expansion(mult = c(0, .01)),
  #   minor_breaks = break_log_cust(0:9)
  # ) +
  scale_x_continuous(
    name = "Total Weighted Award Amount",
    labels = label_dollar(prefix = "£"),
    breaks = breaks_extended(),
    minor_breaks = breaks_extended(20),
    expand = expansion(mult = c(0, .01))
  ) +
  scale_y_discrete(
    name = ""
  ) +
  labs(
    title = "Top 10 Subjects by Total Weighted Amount Awarded",
    subtitle = "NA not included"
  ) 

grants_x_subjects_summary %>% 
  filter(!is.na(subject)) %>%
  arrange(desc(mean_weighted_award_value)) %>% 
  head(10)  %>% 
  ggplot() +
  geom_col(
    aes(
      x = mean_weighted_award_value,
      y = reorder(subject, mean_weighted_award_value),
    ),
    fill = hue_pal()(3)[2]
  ) +
  # scale_x_log10(
  #   name = "Total Weighted Award Amount",
  #   labels = label_dollar(prefix = "£"),
  #   expand = expansion(mult = c(0, .01)),
  #   minor_breaks = break_log_cust(0:9)
  # ) +
  scale_x_continuous(
    name = "Mean Weighted Award Amount",
    labels = label_dollar(prefix = "£"),
    breaks = breaks_extended(),
    minor_breaks = breaks_extended(20),
    expand = expansion(mult = c(0, .01))
  ) +
  scale_y_discrete(
    name = ""
  ) +
  labs(
    title = "Top 10 Subjects by Mean Weighted Amount Awarded",
    subtitle = "NA not included"
  ) 

grants_x_subjects_summary %>% 
  filter(!is.na(subject)) %>%
  arrange(desc(total_number_of_grants)) %>% 
  head(10)  %>% 
  ggplot() +
  geom_col(
    aes(
      x = total_number_of_grants,
      y = reorder(subject, total_number_of_grants),
    ),
    fill = hue_pal()(3)[3]
  ) +
  # scale_x_log10(
  #   name = "Total Weighted Award Amount",
  #   labels = label_dollar(prefix = "£"),
  #   expand = expansion(mult = c(0, .01)),
  #   minor_breaks = break_log_cust(0:9)
  # ) +
  scale_x_continuous(
    name = "Total Number of Grants",
    breaks = breaks_extended(),
    minor_breaks = breaks_extended(20),
    expand = expansion(mult = c(0, .01))
  ) +
  scale_y_discrete(
    name = ""
  ) +
  labs(
    title = "Top 10 Subjects by Total Number of Grants",
    subtitle = "NA not included"
  ) 

### Topic

grants_x_topics_summary %>% 
  filter(topic != "Unclassified") %>%
  arrange(desc(total_weighted_award_value)) %>% 
  head(10)  %>% 
  ggplot() +
  geom_col(
    aes(
      x = total_weighted_award_value,
      y = reorder(topic, total_weighted_award_value),
    ),
    fill = hue_pal()(3)[1]
  ) +
  # scale_x_log10(
  #   name = "Total Weighted Award Amount",
  #   labels = label_dollar(prefix = "£"),
  #   expand = expansion(mult = c(0, .01)),
  #   minor_breaks = break_log_cust(0:9)
  # ) +
  scale_x_continuous(
    name = "Total Weighted Award Amount",
    labels = label_dollar(prefix = "£"),
    breaks = breaks_extended(),
    minor_breaks = breaks_extended(20),
    expand = expansion(mult = c(0, .01))
  ) +
  scale_y_discrete(
    name = ""
  ) +
  labs(
    title = "Top 10 Topics by Total Weighted Amount Awarded",
    subtitle = "Unclassified topics not included"
  ) 

grants_x_topics_summary %>% 
  filter(topic != "Unclassified") %>%
  arrange(desc(mean_weighted_award_value)) %>% 
  head(10)  %>% 
  ggplot() +
  geom_col(
    aes(
      x = mean_weighted_award_value,
      y = reorder(topic, mean_weighted_award_value),
    ),
    fill = hue_pal()(3)[2]
  ) +
  # scale_x_log10(
  #   name = "Total Weighted Award Amount",
  #   labels = label_dollar(prefix = "£"),
  #   expand = expansion(mult = c(0, .01)),
  #   minor_breaks = break_log_cust(0:9)
  # ) +
  scale_x_continuous(
    name = "Mean Weighted Award Amount",
    labels = label_dollar(prefix = "£"),
    breaks = breaks_extended(),
    minor_breaks = breaks_extended(20),
    expand = expansion(mult = c(0, .01))
  ) +
  scale_y_discrete(
    name = ""
  ) +
  labs(
    title = "Top 10 Topics by Mean Weighted Amount Awarded",
    subtitle = "Unclassified topics not included"
  ) 

grants_x_topics_summary %>% 
  filter(topic != "Unclassified") %>%
  arrange(desc(total_number_of_grants)) %>% 
  head(10)  %>% 
  ggplot() +
  geom_col(
    aes(
      x = total_number_of_grants,
      y = reorder(topic, total_number_of_grants),
    ),
    fill = hue_pal()(3)[3]
  ) +
  # scale_x_log10(
  #   name = "Total Weighted Award Amount",
  #   labels = label_dollar(prefix = "£"),
  #   expand = expansion(mult = c(0, .01)),
  #   minor_breaks = break_log_cust(0:9)
  # ) +
  scale_x_continuous(
    name = "Total Number of Grants",
    breaks = breaks_extended(),
    minor_breaks = breaks_extended(20),
    expand = expansion(mult = c(0, .01))
  ) +
  scale_y_discrete(
    name = ""
  ) +
  labs(
    title = "Top 10 Topics by Total Number of Grants",
    subtitle = "Unclassified topics not included"
  ) 

Typically how many People and Organisations are involved in a grant?

Does the Award Amount Change with the Number Grants Completed by Person

grants_x_person_roles <- left_join(datasets$grants, datasets$person_roles, by = "grant_ref") %>% 
  select(
    person_id,
    award_value,
    grant_category,
    start_date,
    role
  ) %>% 
  filter(
    grant_category != "Studentship"
  ) %>% 
  group_by(person_id) %>% 
  mutate(
    grant_number_all = order(order(start_date))
  ) %>% 
  group_by(person_id, grant_category) %>%
  mutate(
    grant_number_category = order(order(start_date))
  ) %>% 
  group_by(person_id, role) %>%
  mutate(
    grant_number_role = order(order(start_date))
  ) %>% 
  group_by(person_id, role, grant_category) %>%
  mutate(
    grant_number_category_and_role = order(order(start_date))
  ) %>% 
  ungroup()
  
  

grants_x_person_roles
# grants_x_person_roles %>% 
#   group_by(
#     grant_number_all
#   ) %>% 
#   summarise(
#     mean_award_value = mean(award_value)
#   ) %>% 
#   ggplot(
#     mapping = aes(
#       x = grant_number_all,
#       y = mean_award_value/1000
#     )
#   ) +
#   geom_line() +
#   scale_y_continuous(
#     name = "Mean Award Value (£k)",
#     labels = number,
#     breaks = breaks_extended(n = 10)
#   ) +
#   scale_x_continuous(
#     name = "Number of Grants completed by Person",
#     breaks = breaks_extended(n = 10),
#     minor_breaks = breaks_width(1)
#   )

grants_x_person_roles %>% 
  group_by(
    grant_number_all
  ) %>% 
  mutate(
    mean_award_value = mean(award_value)
  ) %>% 
  ungroup() %>% 
  ggplot(
    mapping = aes(
      x = grant_number_all,
      y = value,
    )
  ) +
  geom_smooth(aes(y = award_value, col = "Award Value")) + 
  geom_line(aes(y = mean_award_value, col = "Mean Award Value")) +
  scale_y_continuous(
    name = "Award Value",
    labels = label_dollar(prefix = "£"),
    breaks = breaks_extended(n = 10),
    expand = expansion(mult = c(.01, .01))
  ) +
  scale_x_continuous(
    name = "Number of Grants Completed by Person",
    breaks = breaks_extended(n = 10),
    minor_breaks = breaks_width(1),
    expand = expansion(mult = c(0, .01))
  ) + 
  theme(legend.title = element_blank()) +
  labs(
    title = "Change in the Award Value as Number of Grants Awarded to a Person \nIncreases",
    # subtitle = "Unclassified topics not included"
  ) 
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

grants_x_person_roles %>% 
  ggplot() +
  geom_histogram(
    mapping = aes(
      grant_number_all, 
      fill = "Number of Grants",
    ),
    bins = 40,
    show.legend = FALSE
  ) +
  # scale_x_log10(
  #   name = "Total Weighted Award Amount",
  #   labels = label_dollar(prefix = "£"),
  #   expand = expansion(mult = c(0, .01)),
  #   minor_breaks = break_log_cust(0:9)
  # ) +
  scale_x_continuous(
    name = "Number of Grants Completed by Person",
    breaks = breaks_extended(),
    minor_breaks = breaks_extended(40),
    expand = expansion(mult = c(0, .01))
  ) +
  scale_y_continuous(
    name = "Count",
    expand = expansion(mult = c(0, .05))
  ) +
  labs(
    title = "Distribution of the Number of Grants Awarded to a Person",
    # subtitle = "Unclassified topics not included"
  ) 

grants_x_person_roles %>% 
  mutate(
    role = fct_recode(
      fct_lump_n(fct_explicit_na(role), n = 3),
      "Co-Invesigator" =  "CO_INVESTIGATOR",
      "Principle Invesigator" = "PRINCIPAL_INVESTIGATOR",
      "Training Grant Holder" = "TRAINING_GRANT_HOLDER",
      "Other Roles" = "Other"
    ) 
  ) %>% 
  ggplot() +
  geom_smooth(
    aes(
      x = grant_number_role,
      y = award_value,
      fill = role,
      col = role,
    ),
    show.legend = FALSE
  ) + 
  facet_wrap(
    facets = vars(role),
    scales = "free"
  ) +
  scale_y_continuous(
    name = "Award Value",
    labels = label_dollar(prefix = "£")
  ) +
  scale_x_continuous(
    name = "Number of Grants Completed by Person in that Role",
  ) +
  labs(
    title = "Change in the Award Value as Number of Grants Awarded to a Person \nIncreases",
    subtitle = "Grouped by Person Role"
  ) 
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

grants_x_person_roles %>% 
  ggplot() +
  geom_smooth(
    aes(
      x = grant_number_role,
      y = award_value,
      fill = grant_category,
      col = grant_category,
    ),
    show.legend = FALSE
  ) + 
  facet_wrap(
    facets = vars(grant_category),
    scales = "free"
  ) +
  scale_y_continuous(
    name = "Award Value",
    labels = label_dollar(prefix = "£")
  ) +
  scale_x_continuous(
    name = "Number of Grants Completed by Person in that Role",
  ) +
  labs(
    title = "Change in the Award Value as Number of Grants Awarded to a Person \nIncreases",
    subtitle = "Grouped by Grant Type"
  ) 
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'